0. Library

library(rio)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.4     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(ggfortify)
## Registered S3 methods overwritten by 'ggfortify':
##   method                 from    
##   autoplot.Arima         forecast
##   autoplot.acf           forecast
##   autoplot.ar            forecast
##   autoplot.bats          forecast
##   autoplot.decomposed.ts forecast
##   autoplot.ets           forecast
##   autoplot.forecast      forecast
##   autoplot.stl           forecast
##   autoplot.ts            forecast
##   fitted.ar              forecast
##   fortify.ts             forecast
##   residuals.ar           forecast
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

1. Loading Data

dataset <- read.csv(file= "owid-covid-data.csv", stringsAsFactors =FALSE, header=TRUE)

2. Exploratoy Data Analysis of dataset

dim(dataset)
## [1] 100191     60
head(dataset)
tail(dataset)
summary(dataset)
##    iso_code          continent           location             date          
##  Length:100191      Length:100191      Length:100191      Length:100191     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   total_cases          new_cases      new_cases_smoothed  total_deaths    
##  Min.   :        1   Min.   :-74347   Min.   : -6223.0   Min.   :      1  
##  1st Qu.:     1285   1st Qu.:     2   1st Qu.:     7.6   1st Qu.:     54  
##  Median :    13634   Median :    74   Median :    92.3   Median :    399  
##  Mean   :  1075182   Mean   :  6037   Mean   :  6064.2   Mean   :  29024  
##  3rd Qu.:   145379   3rd Qu.:   809   3rd Qu.:   852.1   3rd Qu.:   3846  
##  Max.   :183781831   Max.   :906017   Max.   :826388.4   Max.   :3977058  
##  NA's   :3607        NA's   :3610     NA's   :4620       NA's   :13760    
##    new_deaths      new_deaths_smoothed total_cases_per_million
##  Min.   :-1918.0   Min.   : -232.143   Min.   :     0.0       
##  1st Qu.:    0.0   1st Qu.:    0.000   1st Qu.:   262.7       
##  Median :    2.0   Median :    1.429   Median :  1820.3       
##  Mean   :  146.3   Mean   :  131.873   Mean   : 13250.9       
##  3rd Qu.:   18.0   3rd Qu.:   14.500   3rd Qu.: 13885.6       
##  Max.   :18050.0   Max.   :14737.000   Max.   :180133.3       
##  NA's   :13604     NA's   :4620        NA's   :4121           
##  new_cases_per_million new_cases_smoothed_per_million total_deaths_per_million
##  Min.   :-2153.437     Min.   :-276.825               Min.   :   0.001        
##  1st Qu.:    0.212     1st Qu.:   1.295               1st Qu.:   8.050        
##  Median :    8.453     Median :  11.193               Median :  52.015        
##  Mean   :   76.035     Mean   :  76.368               Mean   : 292.531        
##  3rd Qu.:   70.061     3rd Qu.:  78.577               3rd Qu.: 319.336        
##  Max.   :18293.675     Max.   :4083.500               Max.   :5860.454        
##  NA's   :4124          NA's   :5129                   NA's   :14261           
##  new_deaths_per_million new_deaths_smoothed_per_million reproduction_rate
##  Min.   :-76.445        Min.   :-10.921                 Min.   :-0.020   
##  1st Qu.:  0.000        1st Qu.:  0.000                 1st Qu.: 0.840   
##  Median :  0.134        Median :  0.158                 Median : 1.010   
##  Mean   :  1.551        Mean   :  1.397                 Mean   : 1.003   
##  3rd Qu.:  1.322        3rd Qu.:  1.283                 3rd Qu.: 1.170   
##  Max.   :218.329        Max.   : 63.140                 Max.   : 5.770   
##  NA's   :14105          NA's   :5129                    NA's   :19444    
##   icu_patients     icu_patients_per_million hosp_patients   
##  Min.   :    0.0   Min.   :  0.00           Min.   :     0  
##  1st Qu.:   32.0   1st Qu.:  4.79           1st Qu.:   118  
##  Median :  182.0   Median : 16.87           Median :   650  
##  Mean   : 1055.7   Mean   : 26.16           Mean   :  4589  
##  3rd Qu.:  717.5   3rd Qu.: 40.44           3rd Qu.:  2707  
##  Max.   :28889.0   Max.   :192.74           Max.   :133214  
##  NA's   :90081     NA's   :90081            NA's   :87630   
##  hosp_patients_per_million weekly_icu_admissions
##  Min.   :   0.00           Min.   :   0.00      
##  1st Qu.:  23.44           1st Qu.:  10.75      
##  Median :  79.93           Median :  51.24      
##  Mean   : 167.67           Mean   : 272.98      
##  3rd Qu.: 243.32           3rd Qu.: 220.33      
##  Max.   :1532.57           Max.   :4002.46      
##  NA's   :87630             NA's   :99307        
##  weekly_icu_admissions_per_million weekly_hosp_admissions
##  Min.   :  0.00                    Min.   :     0        
##  1st Qu.:  1.68                    1st Qu.:    50        
##  Median :  8.37                    Median :   320        
##  Mean   : 20.49                    Mean   :  3515        
##  3rd Qu.: 22.68                    3rd Qu.:  1694        
##  Max.   :279.41                    Max.   :116323        
##  NA's   :99307                     NA's   :98622         
##  weekly_hosp_admissions_per_million   new_tests        total_tests       
##  Min.   :   0.00                    Min.   :-239172   Min.   :        0  
##  1st Qu.:   9.35                    1st Qu.:   1658   1st Qu.:   162492  
##  Median :  40.56                    Median :   6201   Median :   831234  
##  Mean   : 107.90                    Mean   :  48049   Mean   :  7823804  
##  3rd Qu.: 123.39                    3rd Qu.:  23961   3rd Qu.:  3476125  
##  Max.   :2656.91                    Max.   :3740296   Max.   :468269982  
##  NA's   :98622                      NA's   :55294     NA's   :55620      
##  total_tests_per_thousand new_tests_per_thousand new_tests_smoothed
##  Min.   :   0.00          Min.   :-23.01         Min.   :      0   
##  1st Qu.:  14.39          1st Qu.:  0.14         1st Qu.:   1741   
##  Median :  72.44          Median :  0.63         Median :   6576   
##  Mean   : 313.82          Mean   :  2.13         Mean   :  45621   
##  3rd Qu.: 297.46          3rd Qu.:  1.96         3rd Qu.:  26470   
##  Max.   :9257.57          Max.   :327.09         Max.   :3080396   
##  NA's   :55620            NA's   :55294          NA's   :47968     
##  new_tests_smoothed_per_thousand positive_rate   tests_per_case   
##  Min.   : 0.00                   Min.   :0.00    Min.   :    1.1  
##  1st Qu.: 0.14                   1st Qu.:0.02    1st Qu.:    7.7  
##  Median : 0.65                   Median :0.05    Median :   18.5  
##  Mean   : 2.04                   Mean   :0.09    Mean   :  163.5  
##  3rd Qu.: 2.01                   3rd Qu.:0.13    3rd Qu.:   57.1  
##  Max.   :90.85                   Max.   :0.93    Max.   :50000.0  
##  NA's   :47968                   NA's   :51439   NA's   :52037    
##  tests_units        total_vaccinations  people_vaccinated  
##  Length:100191      Min.   :0.000e+00   Min.   :0.000e+00  
##  Class :character   1st Qu.:1.248e+05   1st Qu.:9.474e+04  
##  Mode  :character   Median :9.494e+05   Median :6.649e+05  
##                     Mean   :3.251e+07   Mean   :1.807e+07  
##                     3rd Qu.:5.833e+06   3rd Qu.:3.913e+06  
##                     Max.   :3.221e+09   Max.   :1.889e+09  
##                     NA's   :83301       NA's   :84124      
##  people_fully_vaccinated new_vaccinations   new_vaccinations_smoothed
##  Min.   :        1       Min.   :       0   Min.   :       0         
##  1st Qu.:    51215       1st Qu.:    4562   1st Qu.:     879         
##  Median :   402035       Median :   26024   Median :    7075         
##  Mean   : 10041995       Mean   :  687973   Mean   :  333569         
##  3rd Qu.:  2313227       3rd Qu.:  142223   3rd Qu.:   44588         
##  Max.   :884442915       Max.   :47776982   Max.   :43047167         
##  NA's   :86962           NA's   :86099      NA's   :71014            
##  total_vaccinations_per_hundred people_vaccinated_per_hundred
##  Min.   :  0.00                 Min.   :  0.00               
##  1st Qu.:  2.33                 1st Qu.:  1.95               
##  Median : 11.55                 Median :  8.45               
##  Mean   : 24.57                 Mean   : 16.07               
##  3rd Qu.: 35.09                 3rd Qu.: 24.24               
##  Max.   :231.89                 Max.   :116.53               
##  NA's   :83301                  NA's   :84124                
##  people_fully_vaccinated_per_hundred new_vaccinations_smoothed_per_million
##  Min.   :  0.00                      Min.   :     0                       
##  1st Qu.:  0.96                      1st Qu.:   376                       
##  Median :  4.33                      Median :  1713                       
##  Mean   : 10.12                      Mean   :  3253                       
##  3rd Qu.: 13.20                      3rd Qu.:  4656                       
##  Max.   :115.36                      Max.   :118759                       
##  NA's   :86962                       NA's   :71014                        
##  stringency_index   population        population_density    median_age   
##  Min.   :  0.00   Min.   :4.700e+01   Min.   :    0.137   Min.   :15.10  
##  1st Qu.: 44.44   1st Qu.:2.226e+06   1st Qu.:   36.253   1st Qu.:22.20  
##  Median : 60.19   Median :9.890e+06   Median :   83.479   Median :29.90  
##  Mean   : 58.26   Mean   :1.247e+08   Mean   :  386.855   Mean   :30.56  
##  3rd Qu.: 74.54   3rd Qu.:3.481e+07   3rd Qu.:  209.588   3rd Qu.:39.10  
##  Max.   :100.00   Max.   :7.795e+09   Max.   :20546.766   Max.   :48.20  
##  NA's   :16095    NA's   :648         NA's   :7096        NA's   :10655  
##  aged_65_older    aged_70_older    gdp_per_capita     extreme_poverty
##  Min.   : 1.144   Min.   : 0.526   Min.   :   661.2   Min.   : 0.10  
##  1st Qu.: 3.466   1st Qu.: 2.043   1st Qu.:  4466.5   1st Qu.: 0.60  
##  Median : 6.378   Median : 3.871   Median : 12951.8   Median : 2.20  
##  Mean   : 8.789   Mean   : 5.564   Mean   : 19283.0   Mean   :13.41  
##  3rd Qu.:14.312   3rd Qu.: 9.167   3rd Qu.: 27216.4   3rd Qu.:21.20  
##  Max.   :27.049   Max.   :18.493   Max.   :116935.6   Max.   :77.60  
##  NA's   :11661    NA's   :11150    NA's   :10309      NA's   :39527  
##  cardiovasc_death_rate diabetes_prevalence female_smokers   male_smokers  
##  Min.   : 79.37        Min.   : 0.990      Min.   : 0.10   Min.   : 7.70  
##  1st Qu.:167.29        1st Qu.: 5.310      1st Qu.: 1.90   1st Qu.:21.60  
##  Median :242.65        Median : 7.110      Median : 6.30   Median :31.40  
##  Mean   :258.52        Mean   : 7.928      Mean   :10.57   Mean   :32.69  
##  3rd Qu.:329.63        3rd Qu.:10.080      3rd Qu.:19.30   3rd Qu.:41.10  
##  Max.   :724.42        Max.   :30.530      Max.   :44.00   Max.   :78.10  
##  NA's   :10283         NA's   :7971        NA's   :29873   NA's   :30900  
##  handwashing_facilities hospital_beds_per_thousand life_expectancy
##  Min.   :  1.19         Min.   : 0.100             Min.   :53.28  
##  1st Qu.: 19.35         1st Qu.: 1.300             1st Qu.:67.92  
##  Median : 49.84         Median : 2.400             Median :74.62  
##  Mean   : 50.83         Mean   : 3.028             Mean   :73.24  
##  3rd Qu.: 83.24         3rd Qu.: 3.861             3rd Qu.:78.74  
##  Max.   :100.00         Max.   :13.800             Max.   :86.75  
##  NA's   :55008          NA's   :18293              NA's   :5051   
##  human_development_index excess_mortality
##  Min.   :0.394           Min.   :-95.59  
##  1st Qu.:0.602           1st Qu.:  0.41  
##  Median :0.748           Median :  7.34  
##  Mean   :0.727           Mean   : 18.24  
##  3rd Qu.:0.848           3rd Qu.: 23.86  
##  Max.   :0.957           Max.   :409.69  
##  NA's   :10139           NA's   :96688
str(dataset)
## 'data.frame':    100191 obs. of  60 variables:
##  $ iso_code                             : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ continent                            : chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ location                             : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ date                                 : chr  "2020-02-24" "2020-02-25" "2020-02-26" "2020-02-27" ...
##  $ total_cases                          : num  1 1 1 1 1 1 1 1 2 4 ...
##  $ new_cases                            : num  1 0 0 0 0 0 0 0 1 2 ...
##  $ new_cases_smoothed                   : num  NA NA NA NA NA 0.143 0.143 0 0.143 0.429 ...
##  $ total_deaths                         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths                           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_smoothed                  : num  NA NA NA NA NA 0 0 0 0 0 ...
##  $ total_cases_per_million              : num  0.026 0.026 0.026 0.026 0.026 0.026 0.026 0.026 0.051 0.103 ...
##  $ new_cases_per_million                : num  0.026 0 0 0 0 0 0 0 0.026 0.051 ...
##  $ new_cases_smoothed_per_million       : num  NA NA NA NA NA 0.004 0.004 0 0.004 0.011 ...
##  $ total_deaths_per_million             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_per_million               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_smoothed_per_million      : num  NA NA NA NA NA 0 0 0 0 0 ...
##  $ reproduction_rate                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients                         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients_per_million             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients                        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients_per_million            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions_per_million    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions_per_million   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests                          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests_per_thousand             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_per_thousand               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed_per_thousand      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ positive_rate                        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_per_case                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_units                          : chr  "" "" "" "" ...
##  $ total_vaccinations                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_vaccinations_per_hundred       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated_per_hundred        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated_per_hundred  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed_per_million: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ stringency_index                     : num  8.33 8.33 8.33 8.33 8.33 ...
##  $ population                           : num  38928341 38928341 38928341 38928341 38928341 ...
##  $ population_density                   : num  54.4 54.4 54.4 54.4 54.4 ...
##  $ median_age                           : num  18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 ...
##  $ aged_65_older                        : num  2.58 2.58 2.58 2.58 2.58 ...
##  $ aged_70_older                        : num  1.34 1.34 1.34 1.34 1.34 ...
##  $ gdp_per_capita                       : num  1804 1804 1804 1804 1804 ...
##  $ extreme_poverty                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ cardiovasc_death_rate                : num  597 597 597 597 597 ...
##  $ diabetes_prevalence                  : num  9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 ...
##  $ female_smokers                       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ male_smokers                         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ handwashing_facilities               : num  37.7 37.7 37.7 37.7 37.7 ...
##  $ hospital_beds_per_thousand           : num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ life_expectancy                      : num  64.8 64.8 64.8 64.8 64.8 ...
##  $ human_development_index              : num  0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 ...
##  $ excess_mortality                     : num  NA NA NA NA NA NA NA NA NA NA ...
sapply(dataset, class)
##                              iso_code                             continent 
##                           "character"                           "character" 
##                              location                                  date 
##                           "character"                           "character" 
##                           total_cases                             new_cases 
##                             "numeric"                             "numeric" 
##                    new_cases_smoothed                          total_deaths 
##                             "numeric"                             "numeric" 
##                            new_deaths                   new_deaths_smoothed 
##                             "numeric"                             "numeric" 
##               total_cases_per_million                 new_cases_per_million 
##                             "numeric"                             "numeric" 
##        new_cases_smoothed_per_million              total_deaths_per_million 
##                             "numeric"                             "numeric" 
##                new_deaths_per_million       new_deaths_smoothed_per_million 
##                             "numeric"                             "numeric" 
##                     reproduction_rate                          icu_patients 
##                             "numeric"                             "numeric" 
##              icu_patients_per_million                         hosp_patients 
##                             "numeric"                             "numeric" 
##             hosp_patients_per_million                 weekly_icu_admissions 
##                             "numeric"                             "numeric" 
##     weekly_icu_admissions_per_million                weekly_hosp_admissions 
##                             "numeric"                             "numeric" 
##    weekly_hosp_admissions_per_million                             new_tests 
##                             "numeric"                             "numeric" 
##                           total_tests              total_tests_per_thousand 
##                             "numeric"                             "numeric" 
##                new_tests_per_thousand                    new_tests_smoothed 
##                             "numeric"                             "numeric" 
##       new_tests_smoothed_per_thousand                         positive_rate 
##                             "numeric"                             "numeric" 
##                        tests_per_case                           tests_units 
##                             "numeric"                           "character" 
##                    total_vaccinations                     people_vaccinated 
##                             "numeric"                             "numeric" 
##               people_fully_vaccinated                      new_vaccinations 
##                             "numeric"                             "numeric" 
##             new_vaccinations_smoothed        total_vaccinations_per_hundred 
##                             "numeric"                             "numeric" 
##         people_vaccinated_per_hundred   people_fully_vaccinated_per_hundred 
##                             "numeric"                             "numeric" 
## new_vaccinations_smoothed_per_million                      stringency_index 
##                             "numeric"                             "numeric" 
##                            population                    population_density 
##                             "numeric"                             "numeric" 
##                            median_age                         aged_65_older 
##                             "numeric"                             "numeric" 
##                         aged_70_older                        gdp_per_capita 
##                             "numeric"                             "numeric" 
##                       extreme_poverty                 cardiovasc_death_rate 
##                             "numeric"                             "numeric" 
##                   diabetes_prevalence                        female_smokers 
##                             "numeric"                             "numeric" 
##                          male_smokers                handwashing_facilities 
##                             "numeric"                             "numeric" 
##            hospital_beds_per_thousand                       life_expectancy 
##                             "numeric"                             "numeric" 
##               human_development_index                      excess_mortality 
##                             "numeric"                             "numeric"

2a). EDA - Selecting Variables & Observations

#The class of Date is character, but the class should be date.
dataset$date <- as.Date(dataset$date)


#Creating a subset with only four columns that we need.
dataSubset <- data.frame("date" = dataset$date, "location" = dataset$location, "new_cases" = dataset$new_cases, "new_deaths" = dataset$new_deaths)
dataSubset 
#The period of the study is from March 1st, 2020 to June 1st, 2021.
China<-dataSubset %>% filter (location =='China', date>= '2020-03-01',date<='2021-06-01')
US<-dataSubset %>% filter (location =='United States', date>= '2020-03-01',date<='2021-06-01')
India<-dataSubset %>% filter (location =='India', date>= '2020-03-01',date<='2021-06-01')
UK<-dataSubset %>% filter (location =='United Kingdom', date>= '2020-03-01',date<='2021-06-01')
Germany<-dataSubset %>% filter (location =='Germany', date>= '2020-03-01',date<='2021-06-01')
Belgium<-dataSubset %>% filter (location =='Belgium', date>= '2020-03-01',date<='2021-06-01')
Netherlands<-dataSubset %>% filter (location =='Netherlands', date>= '2020-03-01',date<='2021-06-01')

2b). EDA - Missing Values & Combine EU countries

#There are missing values, in the new deaths variable, before first death cases occur, so we are replacing NA to 0.
India[is.na(India)] <-0
UK[is.na(UK)] <-0
Germany[is.na(Germany)] <-0
Belgium[is.na(Belgium)] <-0
Netherlands[is.na(Netherlands)] <-0

#Combine Germany Belgium and Netherlands as EU
EUCounts <- Germany[3:4] + Belgium[3:4] + Netherlands[3:4]
EUDates <- Germany[1]
EULocation<- matrix(rep('EU',458),458,1)
colnames(EULocation) <-c('location')
EU <- cbind(EUDates,EULocation,EUCounts)

2c). EDA - Correlation between new cases and new deaths

cor(US$new_cases,US$new_deaths)
## [1] 0.7066979
cor(China$new_cases,China$new_deaths)
## [1] 0.4119176
cor(India$new_cases, India$new_deaths)
## [1] 0.9129279
cor(UK$new_cases, UK$new_deaths)
## [1] 0.5987431
cor(EU$new_cases, EU$new_deaths)
## [1] 0.5436605
#New cases and new deaths have a positive correlation in all countries. Since new cases reach zero, new deaths will also be zero. In this study we will focus on forecasting new cases only. 

2d). EDA - Negative values (false postive cases)

#Negative values are observed in new cases for China and UK. The negative values are false positives due to negative confirmatory PCR.
summary(US)
##       date              location           new_cases        new_deaths    
##  Min.   :2020-03-01   Length:458         Min.   :     7   Min.   :   0.0  
##  1st Qu.:2020-06-23   Class :character   1st Qu.: 29507   1st Qu.: 654.2  
##  Median :2020-10-15   Mode  :character   Median : 51334   Median :1014.0  
##  Mean   :2020-10-15                      Mean   : 72687   Mean   :1299.4  
##  3rd Qu.:2021-02-06                      3rd Qu.: 79390   3rd Qu.:1721.2  
##  Max.   :2021-06-01                      Max.   :300462   Max.   :4476.0
summary(China)
##       date              location           new_cases        new_deaths      
##  Min.   :2020-03-01   Length:458         Min.   : -1.00   Min.   :   0.000  
##  1st Qu.:2020-06-23   Class :character   1st Qu.:  9.00   1st Qu.:   0.000  
##  Median :2020-10-15   Mode  :character   Median : 15.00   Median :   0.000  
##  Mean   :2020-10-15                      Mean   : 26.06   Mean   :   3.932  
##  3rd Qu.:2021-02-06                      3rd Qu.: 25.00   3rd Qu.:   0.000  
##  Max.   :2021-06-01                      Max.   :575.00   Max.   :1290.000
summary(India)
##       date              location           new_cases        new_deaths    
##  Min.   :2020-03-01   Length:458         Min.   :     0   Min.   :  -1.0  
##  1st Qu.:2020-06-23   Class :character   1st Qu.: 11046   1st Qu.: 121.5  
##  Median :2020-10-15   Mode  :character   Median : 29268   Median : 415.5  
##  Mean   :2020-10-15                      Mean   : 61808   Mean   : 731.7  
##  3rd Qu.:2021-02-06                      3rd Qu.: 66932   3rd Qu.: 878.2  
##  Max.   :2021-06-01                      Max.   :414188   Max.   :4529.0
summary(UK)
##       date              location           new_cases       new_deaths    
##  Min.   :2020-03-01   Length:458         Min.   :-4787   Min.   :   0.0  
##  1st Qu.:2020-06-23   Class :character   1st Qu.: 1422   1st Qu.:  18.0  
##  Median :2020-10-15   Mode  :character   Median : 3726   Median : 101.0  
##  Mean   :2020-10-15                      Mean   : 9839   Mean   : 279.6  
##  3rd Qu.:2021-02-06                      3rd Qu.:14918   3rd Qu.: 445.8  
##  Max.   :2021-06-01                      Max.   :68192   Max.   :1826.0
summary(EU)
##       date              location           new_cases       new_deaths     
##  Min.   :2020-03-01   Length:458         Min.   :   43   Min.   :-105.00  
##  1st Qu.:2020-06-23   Class :character   1st Qu.: 1956   1st Qu.:  27.25  
##  Median :2020-10-15   Mode  :character   Median : 9395   Median : 181.00  
##  Mean   :2020-10-15                      Mean   :14051   Mean   : 287.46  
##  3rd Qu.:2021-02-06                      3rd Qu.:22378   3rd Qu.: 436.00  
##  Max.   :2021-06-01                      Max.   :61408   Max.   :1923.00
#China has a false positive case, so we will change the new case value on 2020-06-03 from -1 to 0, and reduce the new case value on 2020-06-02 from 1 to 0.
China[95,3] <- 0
China[94,3] <- 0
summary(China)
##       date              location           new_cases        new_deaths      
##  Min.   :2020-03-01   Length:458         Min.   :  0.00   Min.   :   0.000  
##  1st Qu.:2020-06-23   Class :character   1st Qu.:  9.00   1st Qu.:   0.000  
##  Median :2020-10-15   Mode  :character   Median : 15.00   Median :   0.000  
##  Mean   :2020-10-15                      Mean   : 26.06   Mean   :   3.932  
##  3rd Qu.:2021-02-06                      3rd Qu.: 25.00   3rd Qu.:   0.000  
##  Max.   :2021-06-01                      Max.   :575.00   Max.   :1290.000
#UK has thousands of false positive cases, from the beginning, and updated the data by reducing 8,010 cases on April 4, 2021.
#In order to have an accurate data, we will import the data released by UK government. (https://coronavirus.data.gov.uk/details/cases)
UK_data <- read.csv(file= "UK_data.csv", stringsAsFactors =FALSE, header=TRUE)
Subset_UK <- data.frame("date" = UK_data$date, "location" = UK_data$areaName, "new_cases" = UK_data$newCasesBySpecimenDate)
Subset_UK <- Subset_UK[order(Subset_UK$date),]
UK<-Subset_UK %>% filter (date>= '2020-03-01',date<='2021-06-01')
summary(UK)
##      date             location           new_cases    
##  Length:458         Length:458         Min.   :   22  
##  Class :character   Class :character   1st Qu.: 1466  
##  Mode  :character   Mode  :character   Median : 3753  
##                                        Mean   : 9823  
##                                        3rd Qu.:14344  
##                                        Max.   :81519

2e). EDA - Decomposition check

ts_US <- ts(US$new_cases, frequency = 7)
ts_US
## Time Series:
## Start = c(1, 1) 
## End = c(66, 3) 
## Frequency = 7 
##   [1]      7     23     19     33     77     53    166    116     75    188
##  [11]    365    439    633    759    234   1467   1833   2657   4494   6367
##  [21]   5995   8873  11238  10619  12082  17856  18690  19630  18899  22075
##  [31]  26314  32286  32222  32307  32386  29895  31390  30779  31235  35942
##  [41]  34403  29083  27256  26939  28818  25408  30002  33066  27907  26062
##  [51]  29820  25926  28911  33612  32327  30380  26488  23715  24575  26459
##  [61]  29205  34907  27348  24321  24068  24528  24569  27429  26830  25176
##  [71]  18805  19298  22927  20433  26817  24733  24135  18388  22389  20979
##  [81]  22725  25769  23649  21099  20077  18662  19663  18551  22320  24478
##  [91]  23632  18983  17414  21502  19857  21658  25403  21142  17928  17616
## [101]  18383  21111  23166  24828  25212  18941  19827  23667  27072  28537
## [111]  31544  32280  25150  32144  37072  35878  40326  45999  41330  40734
## [121]  41299  46420  51820  56636  51351  45693  50747  43106  60661  60119
## [131]  62509  68057  60017  58424  58930  68051  68090  75866  72236  62536
## [141]  60456  62107  64461  70579  68445  73319  64953  54816  56783  66412
## [151]  71873  67466  68712  56201  45568  45501  58774  54466  59349  59303
## [161]  54134  45769  47606  48003  56040  51316  65336  46951  39212  36652
## [171]  45033  47320  44041  48843  43064  34247  36482  40354  45172  45393
## [181]  46832  42754  34404  35362  41846  41018  44251  50382  43158  31218
## [191]  23613  27226  34049  36056  47769  41101  34372  34423  39492  39030
## [201]  45130  49282  42202  38447  51878  39866  39069  47103  48295  44673
## [211]  37528  33294  43338  39443  45650  54950  48577  35738  39376  45289
## [221]  51056  58599  56384  54953  45981  41774  52231  59778  64880  69147
## [231]  56759  49367  67717  61956  63301  76310  81933  82767  62204  67310
## [241]  76838  79426  91019  99264  89754 104900  85211 127198 104541 129367
## [251] 128033 127510 115170 120434 140504 146750 164750 180398 167926 136356
## [261] 162624 163958 173251 191510 198333 179436 146947 174089 175544 183403
## [271] 112526 207933 155661 140328 160321 188339 202894 223421 232644 215811
## [281] 181253 194409 224452 222648 231428 240089 217797 187971 194314 208992
## [291] 246644 239900 251832 192102 188015 198703 198107 229624 194155  97880
## [301] 226416 155845 174152 200408 233701 235600 153916 300462 208853 184005
## [311] 235042 255637 278337 295257 260967 213415 214664 226967 230301 235766
## [321] 242780 201858 177931 143598 176216 183261 193856 190760 170759 131198
## [331] 151677 147626 153961 168804 166613 142459 112152 134975 115303 121691
## [341] 124006 134422 104176  89746  90438  95265  95250 105764  99670  87219
## [351]  65135  54279  62498  70139  69911  79282  71696  57152  56159  72270
## [361]  74749  77504  77349  64626  51422  58098  57098  67191  68060  66419
## [371]  58254  41073  44917  57667  57895  62471  61513  53031  38278  56541
## [381]  54008  59280  60375  61651  55519  33822  51436  53688  86938  67546
## [391]  77377  62842  43223  69273  61429  66765  79119  69887  63261  35133
## [401]  77404  60674  75021  79894  82710  66687  46509  70075  77966  75403
## [411]  74308  80071  52532  42121  68105  60949  62855  67278  62411  53495
## [421]  32153  47568  50836  55150  58251  57919  45391  29403  50491  40723
## [431]  44704  47557  48130  33675  21427  36798  33662  35826  38076  42260
## [441]  28857  16875  28621  27789  29301  30206  27946  19798  12868  25815
## [451]  22739  23976  27448  21859  12001   6734   5775  22940
decompose_US <- decompose(ts_US, "additive")
plot(decompose_US)

ts_China <- ts(China$new_cases, frequency = 7)
decompose_China <- decompose(ts_China, "additive")
plot(decompose_China)

ts_India <- ts(India$new_cases, frequency = 7)
decompose_India <- decompose(ts_India, "additive")
plot(decompose_India)

ts_UK <- ts(UK$new_cases, frequency = 7)
decompose_UK <- decompose(ts_UK, "additive")
plot(decompose_UK)

ts_EU <- ts(EU$new_cases, frequency = 7)
decompose_EU <- decompose(ts_EU, "additive")
plot(decompose_EU)

#### 2f). EDA - Stationary check

acf(ts_US)

#ACF is decaying slowly and remains above the significance range (blue lines), which means it is non-stationary

adf.test(as.matrix(ts_US))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  as.matrix(ts_US)
## Dickey-Fuller = -0.51593, Lag order = 7, p-value = 0.981
## alternative hypothesis: stationary
#Conducting Augmented Dickey-Fuller Test to check if the data is stationary
#p-value is greater than 0.05, which means it is non-stationary again.

acf(ts_China, lag.max = 34)

adf.test(as.matrix(ts_China))
## Warning in adf.test(as.matrix(ts_China)): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  as.matrix(ts_China)
## Dickey-Fuller = -4.8024, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
#ts_China is stationary

acf(ts_India, lag.max = 34)

adf.test(as.matrix(ts_India))
## Warning in adf.test(as.matrix(ts_India)): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  as.matrix(ts_India)
## Dickey-Fuller = -4.5689, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
#ts_India seems non-stationary from its ACF, but passed the ADF test

acf(ts_UK)

adf.test(as.matrix(ts_UK))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  as.matrix(ts_UK)
## Dickey-Fuller = -1.3237, Lag order = 7, p-value = 0.8639
## alternative hypothesis: stationary
#ts_UK is non-stationary

acf(ts_EU)

adf.test(as.matrix(ts_EU))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  as.matrix(ts_EU)
## Dickey-Fuller = -0.37683, Lag order = 7, p-value = 0.9873
## alternative hypothesis: stationary
#ts_EU is non-stationary

3a). Modelling - US

#finding the best ARIMA model with respect to AIC value
model_US <- auto.arima(ts_US, ic="aic", trace =TRUE, approximation = F)
## 
##  ARIMA(2,1,2)(1,0,1)[7] with drift         : 10037.12
##  ARIMA(0,1,0)           with drift         : 10273.22
##  ARIMA(1,1,0)(1,0,0)[7] with drift         : 10077.63
##  ARIMA(0,1,1)(0,0,1)[7] with drift         : 10058.91
##  ARIMA(0,1,0)                              : 10271.23
##  ARIMA(2,1,2)(0,0,1)[7] with drift         : 10063.56
##  ARIMA(2,1,2)(1,0,0)[7] with drift         : Inf
##  ARIMA(2,1,2)(2,0,1)[7] with drift         : 10048.39
##  ARIMA(2,1,2)(1,0,2)[7] with drift         : Inf
##  ARIMA(2,1,2)           with drift         : Inf
##  ARIMA(2,1,2)(0,0,2)[7] with drift         : 10059.9
##  ARIMA(2,1,2)(2,0,0)[7] with drift         : Inf
##  ARIMA(2,1,2)(2,0,2)[7] with drift         : Inf
##  ARIMA(1,1,2)(1,0,1)[7] with drift         : Inf
##  ARIMA(2,1,1)(1,0,1)[7] with drift         : 10035.15
##  ARIMA(2,1,1)(0,0,1)[7] with drift         : 10061.56
##  ARIMA(2,1,1)(1,0,0)[7] with drift         : 10046.47
##  ARIMA(2,1,1)(2,0,1)[7] with drift         : 10048.29
##  ARIMA(2,1,1)(1,0,2)[7] with drift         : Inf
##  ARIMA(2,1,1)           with drift         : 10167.96
##  ARIMA(2,1,1)(0,0,2)[7] with drift         : 10057.93
##  ARIMA(2,1,1)(2,0,0)[7] with drift         : 10047.81
##  ARIMA(2,1,1)(2,0,2)[7] with drift         : Inf
##  ARIMA(1,1,1)(1,0,1)[7] with drift         : 10036.39
##  ARIMA(2,1,0)(1,0,1)[7] with drift         : 10037.41
##  ARIMA(3,1,1)(1,0,1)[7] with drift         : 10037.13
##  ARIMA(1,1,0)(1,0,1)[7] with drift         : 10078.3
##  ARIMA(3,1,0)(1,0,1)[7] with drift         : 10035.81
##  ARIMA(3,1,2)(1,0,1)[7] with drift         : Inf
##  ARIMA(2,1,1)(1,0,1)[7]                    : 10033.15
##  ARIMA(2,1,1)(0,0,1)[7]                    : 10059.56
##  ARIMA(2,1,1)(1,0,0)[7]                    : 10044.47
##  ARIMA(2,1,1)(2,0,1)[7]                    : 10046.29
##  ARIMA(2,1,1)(1,0,2)[7]                    : Inf
##  ARIMA(2,1,1)                              : 10165.97
##  ARIMA(2,1,1)(0,0,2)[7]                    : 10055.94
##  ARIMA(2,1,1)(2,0,0)[7]                    : Inf
##  ARIMA(2,1,1)(2,0,2)[7]                    : Inf
##  ARIMA(1,1,1)(1,0,1)[7]                    : 10034.39
##  ARIMA(2,1,0)(1,0,1)[7]                    : 10035.41
##  ARIMA(3,1,1)(1,0,1)[7]                    : 10035.13
##  ARIMA(2,1,2)(1,0,1)[7]                    : 10035.12
##  ARIMA(1,1,0)(1,0,1)[7]                    : 10076.3
##  ARIMA(1,1,2)(1,0,1)[7]                    : Inf
##  ARIMA(3,1,0)(1,0,1)[7]                    : 10033.81
##  ARIMA(3,1,2)(1,0,1)[7]                    : Inf
## 
##  Best model: ARIMA(2,1,1)(1,0,1)[7]
model_US
## Series: ts_US 
## ARIMA(2,1,1)(1,0,1)[7] 
## 
## Coefficients:
##           ar1      ar2      ma1    sar1     sma1
##       -0.4015  -0.1649  -0.3093  0.9218  -0.6703
## s.e.   0.1357   0.0888   0.1334  0.0376   0.0894
## 
## sigma^2 estimated as 195556347:  log likelihood=-5010.57
## AIC=10033.15   AICc=10033.34   BIC=10057.9
#checking stationary after seasonal differencing
acf(ts(model_US$residuals))

pacf(ts(model_US$residuals))

#ACF shows exponential decays and indicates it is stationary

#forecasting 1 month ahead with a 95% interval
forecast_US<-forecast(model_US, h=4, level=c(95))
forecast_US
##          Point Forecast     Lo 95    Hi 95
## 66.42857      15676.158 -11732.27 43084.58
## 66.57143      18057.317 -10474.36 46588.99
## 66.71429      18069.088 -12592.22 48730.39
## 66.85714       8893.088 -24455.61 42241.79
plot(forecast_US)

summary(forecast_US)
## 
## Forecast method: ARIMA(2,1,1)(1,0,1)[7]
## 
## Model Information:
## Series: ts_US 
## ARIMA(2,1,1)(1,0,1)[7] 
## 
## Coefficients:
##           ar1      ar2      ma1    sar1     sma1
##       -0.4015  -0.1649  -0.3093  0.9218  -0.6703
## s.e.   0.1357   0.0888   0.1334  0.0376   0.0894
## 
## sigma^2 estimated as 195556347:  log likelihood=-5010.57
## AIC=10033.15   AICc=10033.34   BIC=10057.9
## 
## Error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -66.87734 13892.25 7090.738 0.5004233 11.76567 0.5716284
##                      ACF1
## Training set -0.001105334
## 
## Forecasts:
##          Point Forecast     Lo 95    Hi 95
## 66.42857      15676.158 -11732.27 43084.58
## 66.57143      18057.317 -10474.36 46588.99
## 66.71429      18069.088 -12592.22 48730.39
## 66.85714       8893.088 -24455.61 42241.79
accuracy(forecast_US)
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -66.87734 13892.25 7090.738 0.5004233 11.76567 0.5716284
##                      ACF1
## Training set -0.001105334
#actual new cases in US on July 1, 2021
(dataset %>% filter (location =='United States', date=='2021-07-01'))[6]
#14463
#difference between the actual and predicted
abs(14463 - 8893.088)
## [1] 5569.912

3b). Modeling - China

#finding the best ARIMA model with respect to AIC value
model_China <- auto.arima(ts_China, ic="aic", trace =TRUE, approximation = F)
## 
##  ARIMA(2,1,2)(1,0,1)[7] with drift         : 4367.799
##  ARIMA(0,1,0)           with drift         : 4439.068
##  ARIMA(1,1,0)(1,0,0)[7] with drift         : 4404.438
##  ARIMA(0,1,1)(0,0,1)[7] with drift         : 4388.941
##  ARIMA(0,1,0)                              : 4437.759
##  ARIMA(2,1,2)(0,0,1)[7] with drift         : 4366.334
##  ARIMA(2,1,2)           with drift         : 4366.819
##  ARIMA(2,1,2)(0,0,2)[7] with drift         : 4367.357
##  ARIMA(2,1,2)(1,0,0)[7] with drift         : 4366.043
##  ARIMA(2,1,2)(2,0,0)[7] with drift         : 4367.607
##  ARIMA(2,1,2)(2,0,1)[7] with drift         : 4368.416
##  ARIMA(1,1,2)(1,0,0)[7] with drift         : 4372.232
##  ARIMA(2,1,1)(1,0,0)[7] with drift         : 4390.203
##  ARIMA(3,1,2)(1,0,0)[7] with drift         : 4367.07
##  ARIMA(2,1,3)(1,0,0)[7] with drift         : 4367.543
##  ARIMA(1,1,1)(1,0,0)[7] with drift         : 4390.926
##  ARIMA(1,1,3)(1,0,0)[7] with drift         : 4365.681
##  ARIMA(1,1,3)           with drift         : 4366.188
##  ARIMA(1,1,3)(2,0,0)[7] with drift         : 4367.359
##  ARIMA(1,1,3)(1,0,1)[7] with drift         : 4367.502
##  ARIMA(1,1,3)(0,0,1)[7] with drift         : 4365.928
##  ARIMA(1,1,3)(2,0,1)[7] with drift         : Inf
##  ARIMA(0,1,3)(1,0,0)[7] with drift         : 4381.777
##  ARIMA(1,1,4)(1,0,0)[7] with drift         : 4367.255
##  ARIMA(0,1,2)(1,0,0)[7] with drift         : 4390.916
##  ARIMA(0,1,4)(1,0,0)[7] with drift         : 4369.33
##  ARIMA(2,1,4)(1,0,0)[7] with drift         : 4368.988
##  ARIMA(1,1,3)(1,0,0)[7]                    : 4364.613
##  ARIMA(1,1,3)                              : 4365.13
##  ARIMA(1,1,3)(2,0,0)[7]                    : 4366.281
##  ARIMA(1,1,3)(1,0,1)[7]                    : 4366.43
##  ARIMA(1,1,3)(0,0,1)[7]                    : 4364.863
##  ARIMA(1,1,3)(2,0,1)[7]                    : 4366.94
##  ARIMA(0,1,3)(1,0,0)[7]                    : 4380.931
##  ARIMA(1,1,2)(1,0,0)[7]                    : 4371.135
##  ARIMA(2,1,3)(1,0,0)[7]                    : 4366.483
##  ARIMA(1,1,4)(1,0,0)[7]                    : 4366.213
##  ARIMA(0,1,2)(1,0,0)[7]                    : 4390.109
##  ARIMA(0,1,4)(1,0,0)[7]                    : 4368.414
##  ARIMA(2,1,2)(1,0,0)[7]                    : 4364.992
##  ARIMA(2,1,4)(1,0,0)[7]                    : 4367.944
## 
##  Best model: ARIMA(1,1,3)(1,0,0)[7]
model_China
## Series: ts_China 
## ARIMA(1,1,3)(1,0,0)[7] 
## 
## Coefficients:
##          ar1      ma1     ma2     ma3     sar1
##       0.8121  -1.3398  0.3813  0.1832  -0.1093
## s.e.  0.0962   0.0982  0.1103  0.0605   0.0687
## 
## sigma^2 estimated as 808.6:  log likelihood=-2176.31
## AIC=4364.61   AICc=4364.8   BIC=4389.36
#checking stationary after seasonal differencing
acf(ts(model_China$residuals))

pacf(ts(model_China$residuals))

#ACF shows exponential decays and indicates it is stationary

#forecasting 1 month ahead with a 95% interval
forecast_China<-forecast(model_China, h=4, level=c(95))
forecast_China
##          Point Forecast     Lo 95     Hi 95
## 66.42857       22.69106 -33.04218  78.42429
## 66.57143       25.92905 -35.70845  87.56655
## 66.71429       27.30238 -38.73258  93.33734
## 66.85714       29.76334 -43.51591 103.04259
plot(forecast_China)

summary(forecast_China)
## 
## Forecast method: ARIMA(1,1,3)(1,0,0)[7]
## 
## Model Information:
## Series: ts_China 
## ARIMA(1,1,3)(1,0,0)[7] 
## 
## Coefficients:
##          ar1      ma1     ma2     ma3     sar1
##       0.8121  -1.3398  0.3813  0.1832  -0.1093
## s.e.  0.0962   0.0982  0.1103  0.0605   0.0687
## 
## sigma^2 estimated as 808.6:  log likelihood=-2176.31
## AIC=4364.61   AICc=4364.8   BIC=4389.36
## 
## Error measures:
##                     ME     RMSE      MAE  MPE MAPE     MASE      ACF1
## Training set -1.044764 28.24897 11.63521 -Inf  Inf 0.658569 0.1718338
## 
## Forecasts:
##          Point Forecast     Lo 95     Hi 95
## 66.42857       22.69106 -33.04218  78.42429
## 66.57143       25.92905 -35.70845  87.56655
## 66.71429       27.30238 -38.73258  93.33734
## 66.85714       29.76334 -43.51591 103.04259
accuracy(forecast_China)
##                     ME     RMSE      MAE  MPE MAPE     MASE      ACF1
## Training set -1.044764 28.24897 11.63521 -Inf  Inf 0.658569 0.1718338
#actual new cases in China on July 1, 2021
(dataset %>% filter (location =='China', date=='2021-07-01'))[6]
#18
#difference between the actual and predicted
abs(18 - 29.76)
## [1] 11.76

3c). Modeling - India

#finding the best ARIMA model with respect to AIC value
model_India <- auto.arima(ts_India, ic="aic", trace =TRUE, approximation = FALSE)
## 
##  ARIMA(2,0,2)(1,1,1)[7] with drift         : Inf
##  ARIMA(0,0,0)(0,1,0)[7] with drift         : 10456.23
##  ARIMA(1,0,0)(1,1,0)[7] with drift         : 9165.984
##  ARIMA(0,0,1)(0,1,1)[7] with drift         : 9868.198
##  ARIMA(0,0,0)(0,1,0)[7]                    : 10458.52
##  ARIMA(1,0,0)(0,1,0)[7] with drift         : 9203.874
##  ARIMA(1,0,0)(2,1,0)[7] with drift         : Inf
##  ARIMA(1,0,0)(1,1,1)[7] with drift         : Inf
##  ARIMA(1,0,0)(0,1,1)[7] with drift         : Inf
##  ARIMA(1,0,0)(2,1,1)[7] with drift         : Inf
##  ARIMA(0,0,0)(1,1,0)[7] with drift         : 10030.14
##  ARIMA(2,0,0)(1,1,0)[7] with drift         : Inf
##  ARIMA(1,0,1)(1,1,0)[7] with drift         : Inf
##  ARIMA(0,0,1)(1,1,0)[7] with drift         : 9766.506
##  ARIMA(2,0,1)(1,1,0)[7] with drift         : Inf
##  ARIMA(1,0,0)(1,1,0)[7]                    : 9164.187
##  ARIMA(1,0,0)(0,1,0)[7]                    : 9201.967
##  ARIMA(1,0,0)(2,1,0)[7]                    : Inf
##  ARIMA(1,0,0)(1,1,1)[7]                    : Inf
##  ARIMA(1,0,0)(0,1,1)[7]                    : Inf
##  ARIMA(1,0,0)(2,1,1)[7]                    : Inf
##  ARIMA(0,0,0)(1,1,0)[7]                    : 10028.51
##  ARIMA(2,0,0)(1,1,0)[7]                    : Inf
##  ARIMA(1,0,1)(1,1,0)[7]                    : Inf
##  ARIMA(0,0,1)(1,1,0)[7]                    : 9764.541
##  ARIMA(2,0,1)(1,1,0)[7]                    : Inf
## 
##  Best model: ARIMA(1,0,0)(1,1,0)[7]
model_India
## Series: ts_India 
## ARIMA(1,0,0)(1,1,0)[7] 
## 
## Coefficients:
##          ar1     sar1
##       0.9878  -0.2954
## s.e.  0.0082   0.0454
## 
## sigma^2 estimated as 38430875:  log likelihood=-4579.09
## AIC=9164.19   AICc=9164.24   BIC=9176.52
#checking stationary after seasonal differencing
acf(ts(model_India$residuals))

pacf(ts(model_India$residuals))

#ACF shows exponential decays and indicates it is stationary

#forecasting 1 month ahead with a 95% interval
forecast_India<-forecast(model_India, h=4, level=c(95))
forecast_India
##          Point Forecast     Lo 95    Hi 95
## 66.42857      138196.43 126046.10 150346.8
## 66.57143      116863.73  99785.19 133942.3
## 66.71429      108452.48  87662.49 129242.5
## 66.85714       98886.52  75025.27 122747.8
plot(forecast_India)

summary(forecast_India)
## 
## Forecast method: ARIMA(1,0,0)(1,1,0)[7]
## 
## Model Information:
## Series: ts_India 
## ARIMA(1,0,0)(1,1,0)[7] 
## 
## Coefficients:
##          ar1     sar1
##       0.9878  -0.2954
## s.e.  0.0082   0.0454
## 
## sigma^2 estimated as 38430875:  log likelihood=-4579.09
## AIC=9164.19   AICc=9164.24   BIC=9176.52
## 
## Error measures:
##                     ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set -157.6634 6138.052 3382.382 -Inf  Inf 0.2670619 -0.1550257
## 
## Forecasts:
##          Point Forecast     Lo 95    Hi 95
## 66.42857      138196.43 126046.10 150346.8
## 66.57143      116863.73  99785.19 133942.3
## 66.71429      108452.48  87662.49 129242.5
## 66.85714       98886.52  75025.27 122747.8
accuracy(forecast_India)
##                     ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set -157.6634 6138.052 3382.382 -Inf  Inf 0.2670619 -0.1550257
#actual new cases in India on July 1, 2021
(dataset %>% filter (location =='India', date=='2021-07-01'))[6]
#46617
#difference between the actual and predicted
abs(46617 -98886.52)
## [1] 52269.52

3d). Modeling - UK

#finding the best ARIMA model with respect to AIC value
model_UK <- auto.arima(ts_UK, ic="aic", trace =TRUE, approximation = F)
## 
##  ARIMA(2,1,2)(1,0,1)[7] with drift         : 8746.007
##  ARIMA(0,1,0)           with drift         : 8940.362
##  ARIMA(1,1,0)(1,0,0)[7] with drift         : 8790.661
##  ARIMA(0,1,1)(0,0,1)[7] with drift         : 8793.067
##  ARIMA(0,1,0)                              : 8938.365
##  ARIMA(2,1,2)(0,0,1)[7] with drift         : 8751.76
##  ARIMA(2,1,2)(1,0,0)[7] with drift         : 8746.171
##  ARIMA(2,1,2)(2,0,1)[7] with drift         : Inf
##  ARIMA(2,1,2)(1,0,2)[7] with drift         : 8738.516
##  ARIMA(2,1,2)(0,0,2)[7] with drift         : 8763.425
##  ARIMA(2,1,2)(2,0,2)[7] with drift         : 8739.364
##  ARIMA(1,1,2)(1,0,2)[7] with drift         : 8737.044
##  ARIMA(1,1,2)(0,0,2)[7] with drift         : 8761.517
##  ARIMA(1,1,2)(1,0,1)[7] with drift         : 8745.039
##  ARIMA(1,1,2)(2,0,2)[7] with drift         : 8738.015
##  ARIMA(1,1,2)(0,0,1)[7] with drift         : 8775.854
##  ARIMA(1,1,2)(2,0,1)[7] with drift         : Inf
##  ARIMA(0,1,2)(1,0,2)[7] with drift         : 8735.29
##  ARIMA(0,1,2)(0,0,2)[7] with drift         : 8759.553
##  ARIMA(0,1,2)(1,0,1)[7] with drift         : 8743.449
##  ARIMA(0,1,2)(2,0,2)[7] with drift         : 8736.319
##  ARIMA(0,1,2)(0,0,1)[7] with drift         : 8774.647
##  ARIMA(0,1,2)(2,0,1)[7] with drift         : Inf
##  ARIMA(0,1,1)(1,0,2)[7] with drift         : 8740.985
##  ARIMA(0,1,3)(1,0,2)[7] with drift         : 8736.885
##  ARIMA(1,1,1)(1,0,2)[7] with drift         : 8736.771
##  ARIMA(1,1,3)(1,0,2)[7] with drift         : Inf
##  ARIMA(0,1,2)(1,0,2)[7]                    : 8733.297
##  ARIMA(0,1,2)(0,0,2)[7]                    : 8757.563
##  ARIMA(0,1,2)(1,0,1)[7]                    : 8741.457
##  ARIMA(0,1,2)(2,0,2)[7]                    : 8734.324
##  ARIMA(0,1,2)(0,0,1)[7]                    : 8772.657
##  ARIMA(0,1,2)(2,0,1)[7]                    : Inf
##  ARIMA(0,1,1)(1,0,2)[7]                    : 8738.99
##  ARIMA(1,1,2)(1,0,2)[7]                    : 8735.05
##  ARIMA(0,1,3)(1,0,2)[7]                    : 8734.891
##  ARIMA(1,1,1)(1,0,2)[7]                    : 8734.778
##  ARIMA(1,1,3)(1,0,2)[7]                    : Inf
## 
##  Best model: ARIMA(0,1,2)(1,0,2)[7]
model_UK
## Series: ts_UK 
## ARIMA(0,1,2)(1,0,2)[7] 
## 
## Coefficients:
##           ma1      ma2    sar1     sma1     sma2
##       -0.4246  -0.1336  0.9075  -0.4345  -0.2515
## s.e.   0.0481   0.0465  0.0424   0.0678   0.0562
## 
## sigma^2 estimated as 11405895:  log likelihood=-4360.65
## AIC=8733.3   AICc=8733.48   BIC=8758.05
#checking stationary after seasonal differencing
acf(ts(model_UK$residuals))

pacf(ts(model_UK$residuals))

#ACF shows exponential decays and indicates it is stationary

#forecasting 1 month ahead with a 95% interval
forecast_UK<-forecast(model_UK, h=4, level=c(95))
forecast_UK
##          Point Forecast     Lo 95    Hi 95
## 66.42857       4857.418 -1761.893 11476.73
## 66.57143       4508.709 -3128.030 12145.45
## 66.71429       4333.070 -3844.400 12510.54
## 66.85714       3948.322 -4736.275 12632.92
plot(forecast_UK)

summary(forecast_UK)
## 
## Forecast method: ARIMA(0,1,2)(1,0,2)[7]
## 
## Model Information:
## Series: ts_UK 
## ARIMA(0,1,2)(1,0,2)[7] 
## 
## Coefficients:
##           ma1      ma2    sar1     sma1     sma2
##       -0.4246  -0.1336  0.9075  -0.4345  -0.2515
## s.e.   0.0481   0.0465  0.0424   0.0678   0.0562
## 
## sigma^2 estimated as 11405895:  log likelihood=-4360.65
## AIC=8733.3   AICc=8733.48   BIC=8758.05
## 
## Error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 8.504075 3355.067 1231.412 1.607446 11.84683 0.4768903
##                      ACF1
## Training set -0.005361969
## 
## Forecasts:
##          Point Forecast     Lo 95    Hi 95
## 66.42857       4857.418 -1761.893 11476.73
## 66.57143       4508.709 -3128.030 12145.45
## 66.71429       4333.070 -3844.400 12510.54
## 66.85714       3948.322 -4736.275 12632.92
accuracy(forecast_UK)
##                    ME     RMSE      MAE      MPE     MAPE      MASE
## Training set 8.504075 3355.067 1231.412 1.607446 11.84683 0.4768903
##                      ACF1
## Training set -0.005361969
#actual new cases in UK on July 1, 2021 is 28071
#difference between the actual and predicted
abs(28071 - 3944.238)
## [1] 24126.76

3e). Modeling - EU

#finding the best ARIMA model with respect to AIC value
model_EU <- auto.arima(ts_EU, ic="aic", trace =TRUE, approximation = F)
## 
##  ARIMA(2,0,2)(1,1,1)[7] with drift         : 9002.91
##  ARIMA(0,0,0)(0,1,0)[7] with drift         : 9198.97
##  ARIMA(1,0,0)(1,1,0)[7] with drift         : 9153.132
##  ARIMA(0,0,1)(0,1,1)[7] with drift         : 9167.494
##  ARIMA(0,0,0)(0,1,0)[7]                    : 9197.177
##  ARIMA(2,0,2)(0,1,1)[7] with drift         : 9007.764
##  ARIMA(2,0,2)(1,1,0)[7] with drift         : 9055.255
##  ARIMA(2,0,2)(2,1,1)[7] with drift         : 9002.596
##  ARIMA(2,0,2)(2,1,0)[7] with drift         : 8987.179
##  ARIMA(1,0,2)(2,1,0)[7] with drift         : 9000.174
##  ARIMA(2,0,1)(2,1,0)[7] with drift         : 9000.452
##  ARIMA(3,0,2)(2,1,0)[7] with drift         : 9004.215
##  ARIMA(2,0,3)(2,1,0)[7] with drift         : 9002.359
##  ARIMA(1,0,1)(2,1,0)[7] with drift         : 9001.996
##  ARIMA(1,0,3)(2,1,0)[7] with drift         : 9001.645
##  ARIMA(3,0,1)(2,1,0)[7] with drift         : 9002.164
##  ARIMA(3,0,3)(2,1,0)[7] with drift         : 8998.186
##  ARIMA(2,0,2)(2,1,0)[7]                    : 8985.228
##  ARIMA(2,0,2)(1,1,0)[7]                    : 9053.309
##  ARIMA(2,0,2)(2,1,1)[7]                    : 9000.598
##  ARIMA(2,0,2)(1,1,1)[7]                    : 9000.977
##  ARIMA(1,0,2)(2,1,0)[7]                    : 8998.178
##  ARIMA(2,0,1)(2,1,0)[7]                    : 8998.456
##  ARIMA(3,0,2)(2,1,0)[7]                    : 9002.202
##  ARIMA(2,0,3)(2,1,0)[7]                    : 9000.362
##  ARIMA(1,0,1)(2,1,0)[7]                    : 8999.998
##  ARIMA(1,0,3)(2,1,0)[7]                    : 8999.651
##  ARIMA(3,0,1)(2,1,0)[7]                    : 9000.169
##  ARIMA(3,0,3)(2,1,0)[7]                    : 8996.191
## 
##  Best model: ARIMA(2,0,2)(2,1,0)[7]
model_EU
## Series: ts_EU 
## ARIMA(2,0,2)(2,1,0)[7] 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2     sar1     sar2
##       1.8627  -0.8763  -1.6835  0.7546  -0.5989  -0.4088
## s.e.  0.0733   0.0698   0.0591  0.0404   0.0447   0.0451
## 
## sigma^2 estimated as 25618041:  log likelihood=-4485.61
## AIC=8985.23   AICc=8985.48   BIC=9014.01
#checking stationary after seasonal differencing
acf(ts(model_EU$residuals))

pacf(ts(model_EU$residuals))

#ACF shows exponential decays and indicates it is stationary

#forecasting 1 month ahead with a 95% interval
forecast_EU<-forecast(model_EU, h=4, level=c(95))
forecast_EU
##          Point Forecast       Lo 95    Hi 95
## 66.42857       8306.906 -1613.30826 18227.12
## 66.57143      12722.398  2644.21012 22800.59
## 66.71429      10265.902   -29.35596 20561.16
## 66.85714       7650.100 -2912.08897 18212.29
plot(forecast_EU)

summary(forecast_EU)
## 
## Forecast method: ARIMA(2,0,2)(2,1,0)[7]
## 
## Model Information:
## Series: ts_EU 
## ARIMA(2,0,2)(2,1,0)[7] 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2     sar1     sar2
##       1.8627  -0.8763  -1.6835  0.7546  -0.5989  -0.4088
## s.e.  0.0733   0.0698   0.0591  0.0404   0.0447   0.0451
## 
## sigma^2 estimated as 25618041:  log likelihood=-4485.61
## AIC=8985.23   AICc=8985.48   BIC=9014.01
## 
## Error measures:
##                    ME     RMSE      MAE      MPE    MAPE      MASE        ACF1
## Training set 57.60673 4989.077 2799.547 -2.88599 23.6379 0.7205025 0.006634856
## 
## Forecasts:
##          Point Forecast       Lo 95    Hi 95
## 66.42857       8306.906 -1613.30826 18227.12
## 66.57143      12722.398  2644.21012 22800.59
## 66.71429      10265.902   -29.35596 20561.16
## 66.85714       7650.100 -2912.08897 18212.29
accuracy(forecast_EU)
##                    ME     RMSE      MAE      MPE    MAPE      MASE        ACF1
## Training set 57.60673 4989.077 2799.547 -2.88599 23.6379 0.7205025 0.006634856
#actual new cases in EU on July 1, 2021 is 2337
(dataset %>% filter (location =='Belgium', date=='2021-07-01'))[6] +(dataset %>% filter (location =='Netherlands', date=='2021-07-01'))[6] +(dataset %>% filter (location =='Germany', date=='2021-07-01'))[6]
#difference between the actual and predicted
abs(2337 - 7650.1)
## [1] 5313.1

3f). Modeling - US only 2021 data (extra)

US2 <- US %>% filter (location =='United States', date>= '2021-01-01',date<='2021-06-01')
ggplot(data= US2, aes(date,new_cases)) + geom_line()

ts_US2 <- ts(US2$new_cases,frequency =7)
ts_US2
## Time Series:
## Start = c(1, 1) 
## End = c(22, 5) 
## Frequency = 7 
##   [1] 153916 300462 208853 184005 235042 255637 278337 295257 260967 213415
##  [11] 214664 226967 230301 235766 242780 201858 177931 143598 176216 183261
##  [21] 193856 190760 170759 131198 151677 147626 153961 168804 166613 142459
##  [31] 112152 134975 115303 121691 124006 134422 104176  89746  90438  95265
##  [41]  95250 105764  99670  87219  65135  54279  62498  70139  69911  79282
##  [51]  71696  57152  56159  72270  74749  77504  77349  64626  51422  58098
##  [61]  57098  67191  68060  66419  58254  41073  44917  57667  57895  62471
##  [71]  61513  53031  38278  56541  54008  59280  60375  61651  55519  33822
##  [81]  51436  53688  86938  67546  77377  62842  43223  69273  61429  66765
##  [91]  79119  69887  63261  35133  77404  60674  75021  79894  82710  66687
## [101]  46509  70075  77966  75403  74308  80071  52532  42121  68105  60949
## [111]  62855  67278  62411  53495  32153  47568  50836  55150  58251  57919
## [121]  45391  29403  50491  40723  44704  47557  48130  33675  21427  36798
## [131]  33662  35826  38076  42260  28857  16875  28621  27789  29301  30206
## [141]  27946  19798  12868  25815  22739  23976  27448  21859  12001   6734
## [151]   5775  22940
decompose_US <- decompose(ts_US2, "additive")
plot(decompose_US)

acf(ts_US2)

pacf(ts_US2)

model_US2 <- auto.arima(ts_US2, ic="aic", trace =TRUE, approximation = F)
## 
##  ARIMA(2,1,2)(1,0,1)[7] with drift         : Inf
##  ARIMA(0,1,0)           with drift         : 3436.874
##  ARIMA(1,1,0)(1,0,0)[7] with drift         : 3378.737
##  ARIMA(0,1,1)(0,0,1)[7] with drift         : 3387.647
##  ARIMA(0,1,0)                              : 3435.133
##  ARIMA(1,1,0)           with drift         : 3429.823
##  ARIMA(1,1,0)(2,0,0)[7] with drift         : 3371.47
##  ARIMA(1,1,0)(2,0,1)[7] with drift         : 3373.47
##  ARIMA(1,1,0)(1,0,1)[7] with drift         : 3373.713
##  ARIMA(0,1,0)(2,0,0)[7] with drift         : 3411.079
##  ARIMA(2,1,0)(2,0,0)[7] with drift         : 3362.669
##  ARIMA(2,1,0)(1,0,0)[7] with drift         : 3366.716
##  ARIMA(2,1,0)(2,0,1)[7] with drift         : 3363.188
##  ARIMA(2,1,0)(1,0,1)[7] with drift         : Inf
##  ARIMA(3,1,0)(2,0,0)[7] with drift         : 3364.648
##  ARIMA(2,1,1)(2,0,0)[7] with drift         : 3364.656
##  ARIMA(1,1,1)(2,0,0)[7] with drift         : 3365.16
##  ARIMA(3,1,1)(2,0,0)[7] with drift         : 3366.179
##  ARIMA(2,1,0)(2,0,0)[7]                    : 3360.707
##  ARIMA(2,1,0)(1,0,0)[7]                    : 3364.741
##  ARIMA(2,1,0)(2,0,1)[7]                    : 3361.256
##  ARIMA(2,1,0)(1,0,1)[7]                    : Inf
##  ARIMA(1,1,0)(2,0,0)[7]                    : 3369.485
##  ARIMA(3,1,0)(2,0,0)[7]                    : 3362.683
##  ARIMA(2,1,1)(2,0,0)[7]                    : 3362.692
##  ARIMA(1,1,1)(2,0,0)[7]                    : 3363.236
##  ARIMA(3,1,1)(2,0,0)[7]                    : 3364.184
## 
##  Best model: ARIMA(2,1,0)(2,0,0)[7]
model_US2
## Series: ts_US2 
## ARIMA(2,1,0)(2,0,0)[7] 
## 
## Coefficients:
##           ar1      ar2    sar1    sar2
##       -0.9437  -0.4262  0.5047  0.3454
## s.e.   0.1223   0.1272  0.1266  0.1369
## 
## sigma^2 estimated as 245647046:  log likelihood=-1675.35
## AIC=3360.71   AICc=3361.12   BIC=3375.79
acf(ts(model_US2$residuals))

pacf(ts(model_US2$residuals))

forecast_US2<-forecast(model_US2, h=4, level=c(95))
forecast_US2
##          Point Forecast     Lo 95    Hi 95
## 22.71429      11074.415 -19644.36 41793.19
## 22.85714      17318.882 -13448.53 48086.29
## 23.00000      15319.137 -19357.15 49995.43
## 23.14286       4236.653 -33500.57 41973.87
plot(forecast_US2)

summary(forecast_US2)
## 
## Forecast method: ARIMA(2,1,0)(2,0,0)[7]
## 
## Model Information:
## Series: ts_US2 
## ARIMA(2,1,0)(2,0,0)[7] 
## 
## Coefficients:
##           ar1      ar2    sar1    sar2
##       -0.9437  -0.4262  0.5047  0.3454
## s.e.   0.1223   0.1272  0.1266  0.1369
## 
## sigma^2 estimated as 245647046:  log likelihood=-1675.35
## AIC=3360.71   AICc=3361.12   BIC=3375.79
## 
## Error measures:
##                     ME     RMSE     MAE       MPE     MAPE      MASE      ACF1
## Training set -886.8403 15413.19 8651.75 -2.113732 10.95465 0.5474379 0.1816099
## 
## Forecasts:
##          Point Forecast     Lo 95    Hi 95
## 22.71429      11074.415 -19644.36 41793.19
## 22.85714      17318.882 -13448.53 48086.29
## 23.00000      15319.137 -19357.15 49995.43
## 23.14286       4236.653 -33500.57 41973.87
accuracy(forecast_US2)
##                     ME     RMSE     MAE       MPE     MAPE      MASE      ACF1
## Training set -886.8403 15413.19 8651.75 -2.113732 10.95465 0.5474379 0.1816099
#difference between the actual and predicted
abs(14463 - 4236.653)
## [1] 10226.35

.

.

.

.

.

.

.

.